In this notebook we explore the results of a frequent-itemset calculation on the encoded notebooks. We encode the notebooks using the top-down and bottom-up methods that we have been working with, then gather the resulting buckets and run the FP-growth algorithm to recognize common patterns.
In [70]:
# Necessary imports
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.features import Features
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
from nbminer.encoders.ast_graph.ast_graph import *
from nbminer.stats.summarize_corpus import SummarizeCorpus
# Collect every .ipynb file that sits directly inside a sub-directory of the
# corpus root, then wrap the first five in NotebookMiner objects.
# os.listdir() returns entries in arbitrary order, so both listings are sorted:
# otherwise the [:5] sample could differ between runs and machines.
people = sorted(os.listdir('../testbed/Final'))
notebooks = []
for person in people:
    person_dir = os.path.join('../testbed/Final', person)
    if os.path.isdir(person_dir):
        direc = sorted(os.listdir(person_dir))
        notebooks.extend(
            os.path.join(person_dir, filename)
            for filename in direc
            if filename.endswith('.ipynb')
        )
notebook_objs = [NotebookMiner(path) for path in notebooks[:5]]
a = Features(notebook_objs)
%matplotlib inline
import matplotlib.pyplot as plt
# Two side-by-side histograms (25 bins each) built from SummarizeCorpus(a):
# left uses get_cell_sizes(), right uses get_top_level().
# The y-axis label spelling is corrected ("Occurrences").
plt.rcParams['figure.figsize'] = (20, 5)
fig, axes = plt.subplots(1, 2)
axes[0].hist(SummarizeCorpus(a).get_cell_sizes(), bins=25)
axes[0].set_xlabel('Lines of Code')
axes[0].set_ylabel('Number of Occurrences')
axes[1].hist(SummarizeCorpus(a).get_top_level(), bins=25)
axes[1].set_xlabel('AST Top Level Nodes')
axes[1].set_ylabel('Number of Occurrences')
Out[70]:
In [9]:
# Necessary imports
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.features import Features
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
from nbminer.encoders.ast_graph.ast_graph import *
In [10]:
# Collect every .ipynb file that sits directly inside a sub-directory of the
# corpus root and wrap each one in a NotebookMiner.
# os.listdir() returns entries in arbitrary order; sorting both listings keeps
# the order of `notebooks` / `notebook_objs` stable across runs and machines.
people = sorted(os.listdir('../testbed/Final'))
notebooks = []
for person in people:
    person_dir = os.path.join('../testbed/Final', person)
    if os.path.isdir(person_dir):
        direc = sorted(os.listdir(person_dir))
        notebooks.extend(
            os.path.join(person_dir, filename)
            for filename in direc
            if filename.endswith('.ipynb')
        )
notebook_objs = [NotebookMiner(path) for path in notebooks]
In [11]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.results.reconstruction_error.astor_error import AstorError
from nbminer.freq_itemsets.frequent_itemsets import FrequentItemsets
# Bottom-up run: resample -> AST graph reducer -> imports -> frequent itemsets.
a = Features(notebook_objs)
gastf = GetASTFeatures()  # constructed here but not added to the pipeline below
rbn = ResampleByNode()
gi = GetImports()
# Bind the two stages that later cells inspect to section-specific names
# right away, so the generic `agr` / `fi` names can be reused further down.
agr_cells = agr = ASTGraphReducer(a, threshold=8, split_call=True)
fi_bu_cells = fi = FrequentItemsets()
pipe = Pipeline([rbn, agr_cells, gi, fi_bu_cells])
# `a` is rebound to whatever the pipeline returns.
a = pipe.transform(a)
In [12]:
# Summary figures for the bottom-up cell run: (label, method name, kwargs).
# Each method is called immediately before its line is printed.
report_rows = (
    ("Number of Cells: \n", "get_number_buckets", {}),
    ("Number of Itemsets: \n", "get_number_itemsets", {"min_pattern": 2}),
    ("Percentage of Cells with Itemsets: \n", "get_percentage", {"min_pattern": 2}),
    ("Average number of Itemsets per Cell (Excluding cells without itemsets):\n", "get_avg_number", {"min_pattern": 2}),
)
for label, method_name, kwargs in report_rows:
    print(label, getattr(fi_bu_cells, method_name)(**kwargs))
In [13]:
%matplotlib inline
import matplotlib.pyplot as plt
# Histogram of the values returned by get_pattern_lengths() for the bottom-up
# cell run. Uses the explicit axes API with a title and axis labels so the
# figure stands alone; the trailing ';' suppresses the hist() tuple repr.
fig, ax = plt.subplots()
ax.hist(fi_bu_cells.get_pattern_lengths())
ax.set_title('Pattern lengths: bottom-up encoding (cells)')
ax.set_xlabel('Pattern length')
ax.set_ylabel('Count');
Out[13]:
In [ ]:
In [ ]:
In [25]:
import operator
def get_bar_plot_info(functions):
    """Convert a ``{label: value}`` mapping into inputs for a bar chart.

    The entries are ordered by ascending value; entries with equal values keep
    the mapping's iteration order because ``sorted`` is stable.

    Args:
        functions: Mapping from a label to a sortable value.

    Returns:
        A tuple ``(x, y, x_ticks)`` of three equally long lists:
        ``x`` holds the 1-based bar positions, ``y`` the values in ascending
        order and ``x_ticks`` the labels in that same order. All three are
        empty when ``functions`` is empty.
    """
    sorted_functions = sorted(functions.items(), key=operator.itemgetter(1))
    x = list(range(1, len(sorted_functions) + 1))
    y = [value for _, value in sorted_functions]
    x_ticks = [label for label, _ in sorted_functions]
    return x, y, x_ticks
In [15]:
# Exploration notes kept from the original cell. They look like
# "<get_patterns argument>, <index> - <what the pattern shows>" (TODO confirm):
# 1, 0 - label encoder
# 4, 0 - subplots
# 2, 2 - load
pattern = fi_bu_cells.get_patterns(2)[2]
functions, all_functions = (
    fi_bu_cells.get_function_dict(pattern),
    fi_bu_cells.get_full_function_dict(pattern),
)
maxim = fi_bu_cells.get_number_matches(pattern)
plt.rcParams.update({'figure.figsize': (20, 10)})
x, y, x_ticks = get_bar_plot_info(functions)
x2, y2, x_ticks2 = get_bar_plot_info(all_functions)
# Two stacked bar charts; each gets a horizontal line at the pattern's match count.
fig, axes = plt.subplots(nrows=2)
upper_ax, lower_ax = axes
upper_ax.bar(x, y, align='center', tick_label=x_ticks)
upper_ax.axhline(y=maxim)
lower_ax.bar(x2, y2, align='center', tick_label=x_ticks2)
lower_ax.axhline(y=maxim)
Out[15]:
In [16]:
import astor
# For every non-empty item of the selected pattern, print the source that
# astor renders for the example returned by templates.get_random_example(),
# then print the result of print_itemset_examples() for the whole pattern.
# NOTE(review): the final print is placed after the loop because it receives
# the whole pattern rather than a single item — confirm against the notebook.
l = pattern  # alternative: fi_bu_cells.get_patterns(1)[0]
for el in l:
    if el != '':
        print(astor.to_source(agr_cells.templates.get_random_example(el)))
print(fi_bu_cells.print_itemset_examples(l, 2))
In [27]:
# Exploration notes kept from the original cell:
# 1, 0 - label encoder
# 4, 0 - subplots
# 2, 2 - load
patterns = fi_bu_cells.get_patterns(1)

# Accumulate over all patterns by feeding the running dict back into each call.
functions = {}
all_functions = {}
for pattern in patterns:
    functions = fi_bu_cells.get_function_dict(pattern, functions)
for pattern in patterns:
    all_functions = fi_bu_cells.get_full_function_dict(pattern, all_functions)

plt.rcParams.update({'figure.figsize': (20, 10)})
x, y, x_ticks = get_bar_plot_info(functions)
x2, y2, x_ticks2 = get_bar_plot_info(all_functions)
plt.bar(x, y, align='center', tick_label=x_ticks)
# Text listing of the same bars, in plotted order.
for tick, height in zip(x_ticks, y):
    print(tick, height)
In [ ]:
In [10]:
# Re-list the corpus and build fresh NotebookMiner objects for this section.
# os.listdir() returns entries in arbitrary order; sorting both listings keeps
# the order of `notebooks` / `notebook_objs` stable across runs and machines.
people = sorted(os.listdir('../testbed/Final'))
notebooks = []
for person in people:
    person_dir = os.path.join('../testbed/Final', person)
    if os.path.isdir(person_dir):
        direc = sorted(os.listdir(person_dir))
        notebooks.extend(
            os.path.join(person_dir, filename)
            for filename in direc
            if filename.endswith('.ipynb')
        )
notebook_objs = [NotebookMiner(path) for path in notebooks]
In [11]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.results.reconstruction_error.astor_error import AstorError
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.freq_itemsets.frequent_itemsets import FrequentItemsets
# Top-down run: resample -> imports -> feature encoding -> k-means encoder
# -> frequent itemsets.
a = Features(notebook_objs)
gastf = GetASTFeatures()  # constructed here but not added to the pipeline below
rbn = ResampleByNode()
gi = GetImports()
fe = FeatureEncoding()
# Bind the two stages that later cells inspect to section-specific names
# right away, so the generic `ke` / `fi` names can be reused further down.
ke_cells = ke = KmeansEncoder(n_clusters=500)
fi_td_cells = fi = FrequentItemsets()
pipe = Pipeline([rbn, gi, fe, ke_cells, fi_td_cells])
# `a` is rebound to whatever the pipeline returns.
a = pipe.transform(a)
In [12]:
# Summary figures for the top-down cell run: (label, method name, kwargs).
# Each method is called immediately before its line is printed.
report_rows = (
    ("Number of Cells: \n", "get_number_buckets", {}),
    ("Number of Itemsets: \n", "get_number_itemsets", {"min_pattern": 2}),
    ("Percentage of Cells with Itemsets: \n", "get_percentage", {"min_pattern": 2}),
    ("Average number of Itemsets per Cell (Excluding cells without itemsets):\n", "get_avg_number", {"min_pattern": 2}),
)
for label, method_name, kwargs in report_rows:
    print(label, getattr(fi_td_cells, method_name)(**kwargs))
In [13]:
%matplotlib inline
import matplotlib.pyplot as plt
# Histogram of the values returned by get_pattern_lengths() for the top-down
# cell run. Uses the explicit axes API with a title and axis labels so the
# figure stands alone; the trailing ';' suppresses the hist() tuple repr.
fig, ax = plt.subplots()
ax.hist(fi_td_cells.get_pattern_lengths())
ax.set_title('Pattern lengths: top-down encoding (cells)')
ax.set_xlabel('Pattern length')
ax.set_ylabel('Count');
Out[13]:
In [80]:
# Exploration notes kept from the original cell. They look like
# "<get_patterns argument>, <index> - <what the pattern shows>" (TODO confirm):
#1, 0
#1, 2 - Read, open
pattern = fi_td_cells.get_patterns(1)[2]
functions, all_functions = (
    fi_td_cells.get_function_dict(pattern),
    fi_td_cells.get_full_function_dict(pattern),
)
maxim = fi_td_cells.get_number_matches(pattern)
plt.rcParams.update({'figure.figsize': (20, 10)})
x, y, x_ticks = get_bar_plot_info(functions)
x2, y2, x_ticks2 = get_bar_plot_info(all_functions)
# Two stacked bar charts; each gets a horizontal line at the pattern's match count.
fig, axes = plt.subplots(nrows=2)
upper_ax, lower_ax = axes
upper_ax.bar(x, y, align='center', tick_label=x_ticks)
upper_ax.axhline(y=maxim)
lower_ax.bar(x2, y2, align='center', tick_label=x_ticks2)
lower_ax.axhline(y=maxim)
Out[80]:
In [81]:
import astor
# For every non-empty item of the selected pattern, print the source that
# astor renders for the example returned by templates.get_random_example(),
# then print the result of print_itemset_examples() for the whole pattern.
# NOTE(review): the final print is placed after the loop because it receives
# the whole pattern rather than a single item — confirm against the notebook.
l = pattern  # alternative: fi_td_cells.get_patterns(1)[0]
for el in l:
    if el != '':
        print(astor.to_source(ke_cells.templates.get_random_example(el)))
print(fi_td_cells.print_itemset_examples(l, 4))
In [ ]:
In [ ]:
In [ ]:
In [23]:
# Re-list the corpus and build fresh NotebookMiner objects for this section.
# os.listdir() returns entries in arbitrary order; sorting both listings keeps
# the order of `notebooks` / `notebook_objs` stable across runs and machines.
people = sorted(os.listdir('../testbed/Final'))
notebooks = []
for person in people:
    person_dir = os.path.join('../testbed/Final', person)
    if os.path.isdir(person_dir):
        direc = sorted(os.listdir(person_dir))
        notebooks.extend(
            os.path.join(person_dir, filename)
            for filename in direc
            if filename.endswith('.ipynb')
        )
notebook_objs = [NotebookMiner(path) for path in notebooks]
In [24]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.results.reconstruction_error.astor_error import AstorError
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.freq_itemsets.frequent_gram_itemsets import FrequentGramItemsets
# Bottom-up run whose results are stored under the *_4gram names:
# resample -> imports -> AST graph reducer -> frequent itemsets.
# NOTE(review): this cell imports FrequentGramItemsets, yet the miner built
# below is FrequentItemsets — confirm which class the 4-gram run should use.
# NOTE(review): GetImports runs before ASTGraphReducer here, while the earlier
# bottom-up pipeline ran it after — confirm the stage order is intentional.
a = Features(notebook_objs)
gastf = GetASTFeatures()  # constructed here but not added to the pipeline below
rbn = ResampleByNode()
gi = GetImports()
agr_4gram = agr = ASTGraphReducer(a, threshold=8, split_call=True)
fi_bu_4gram = fi = FrequentItemsets()
pipe = Pipeline([rbn, gi, agr_4gram, fi_bu_4gram])
# `a` is rebound to whatever the pipeline returns.
a = pipe.transform(a)
In [25]:
# Summary figures for the bottom-up *_4gram run: (label, method name, kwargs).
# Each method is called immediately before its line is printed.
report_rows = (
    ("Number of Cells: \n", "get_number_buckets", {}),
    ("Number of Itemsets: \n", "get_number_itemsets", {"min_pattern": 2}),
    ("Percentage of Cells with Itemsets: \n", "get_percentage", {"min_pattern": 2}),
    ("Average number of Itemsets per Cell (Excluding cells without itemsets):\n", "get_avg_number", {"min_pattern": 2}),
)
for label, method_name, kwargs in report_rows:
    print(label, getattr(fi_bu_4gram, method_name)(**kwargs))
In [26]:
%matplotlib inline
import matplotlib.pyplot as plt
# Histogram of the values returned by get_pattern_lengths() for the bottom-up
# *_4gram run. Uses the explicit axes API with a title and axis labels so the
# figure stands alone; the trailing ';' suppresses the hist() tuple repr.
fig, ax = plt.subplots()
ax.hist(fi_bu_4gram.get_pattern_lengths())
ax.set_title('Pattern lengths: bottom-up encoding (4-gram run)')
ax.set_xlabel('Pattern length')
ax.set_ylabel('Count');
Out[26]:
In [27]:
# First pattern returned by get_patterns() with its default arguments.
pattern = fi_bu_4gram.get_patterns()[0]
functions, all_functions = (
    fi_bu_4gram.get_function_dict(pattern),
    fi_bu_4gram.get_full_function_dict(pattern),
)
maxim = fi_bu_4gram.get_number_matches(pattern)
plt.rcParams.update({'figure.figsize': (20, 10)})
x, y, x_ticks = get_bar_plot_info(functions)
x2, y2, x_ticks2 = get_bar_plot_info(all_functions)
# Two stacked bar charts; each gets a horizontal line at the pattern's match count.
fig, axes = plt.subplots(nrows=2)
upper_ax, lower_ax = axes
upper_ax.bar(x, y, align='center', tick_label=x_ticks)
upper_ax.axhline(y=maxim)
lower_ax.bar(x2, y2, align='center', tick_label=x_ticks2)
lower_ax.axhline(y=maxim)
Out[27]:
In [28]:
import astor
# For every non-empty item of the chosen pattern, print the source that astor
# renders for the example returned by templates.get_random_example(), then
# print the result of print_itemset_examples() for the whole pattern.
# NOTE(review): the pattern is fetched with get_patterns(1)[0] here, whereas
# the plot cell before it uses get_patterns()[0] — confirm they are meant to
# refer to the same pattern.
# NOTE(review): the final print is placed after the loop because it receives
# the whole pattern rather than a single item — confirm against the notebook.
l = fi_bu_4gram.get_patterns(1)[0]
for el in l:
    if el != '':
        print(astor.to_source(agr_4gram.templates.get_random_example(el)))
print(fi_bu_4gram.print_itemset_examples(l, 2))
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [29]:
# Re-list the corpus and build fresh NotebookMiner objects for this section.
# os.listdir() returns entries in arbitrary order; sorting both listings keeps
# the order of `notebooks` / `notebook_objs` stable across runs and machines.
people = sorted(os.listdir('../testbed/Final'))
notebooks = []
for person in people:
    person_dir = os.path.join('../testbed/Final', person)
    if os.path.isdir(person_dir):
        direc = sorted(os.listdir(person_dir))
        notebooks.extend(
            os.path.join(person_dir, filename)
            for filename in direc
            if filename.endswith('.ipynb')
        )
notebook_objs = [NotebookMiner(path) for path in notebooks]
In [30]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.results.reconstruction_error.astor_error import AstorError
from nbminer.freq_itemsets.frequent_gram_itemsets import FrequentGramItemsets
# Top-down run whose results are stored under the *_4gram names:
# resample -> imports -> feature encoding -> k-means encoder -> frequent itemsets.
# GetImports, FeatureEncoding, KmeansEncoder and FrequentItemsets are not
# imported in this cell; they must already be in scope from earlier cells.
# NOTE(review): this cell imports FrequentGramItemsets, yet the miner built
# below is FrequentItemsets — confirm which class the 4-gram run should use.
a = Features(notebook_objs)
gastf = GetASTFeatures()  # constructed here but not added to the pipeline below
rbn = ResampleByNode()
gi = GetImports()
fe = FeatureEncoding()
ke_4gram = ke = KmeansEncoder(n_clusters=500)
fi_td_4gram = fi = FrequentItemsets()
pipe = Pipeline([rbn, gi, fe, ke_4gram, fi_td_4gram])
# `a` is rebound to whatever the pipeline returns.
a = pipe.transform(a)
In [31]:
# Summary figures for the top-down *_4gram run: (label, method name, kwargs).
# Each method is called immediately before its line is printed.
report_rows = (
    ("Number of Cells: \n", "get_number_buckets", {}),
    ("Number of Itemsets: \n", "get_number_itemsets", {"min_pattern": 2}),
    ("Percentage of Cells with Itemsets: \n", "get_percentage", {"min_pattern": 2}),
    ("Average number of Itemsets per Cell (Excluding cells without itemsets):\n", "get_avg_number", {"min_pattern": 2}),
)
for label, method_name, kwargs in report_rows:
    print(label, getattr(fi_td_4gram, method_name)(**kwargs))
In [32]:
%matplotlib inline
import matplotlib.pyplot as plt
# Histogram of the values returned by get_pattern_lengths() for the top-down
# *_4gram run. Uses the explicit axes API with a title and axis labels so the
# figure stands alone; the trailing ';' suppresses the hist() tuple repr.
fig, ax = plt.subplots()
ax.hist(fi_td_4gram.get_pattern_lengths())
ax.set_title('Pattern lengths: top-down encoding (4-gram run)')
ax.set_xlabel('Pattern length')
ax.set_ylabel('Count');
Out[32]:
In [33]:
# First pattern returned by get_patterns(1).
pattern = fi_td_4gram.get_patterns(1)[0]
functions, all_functions = (
    fi_td_4gram.get_function_dict(pattern),
    fi_td_4gram.get_full_function_dict(pattern),
)
maxim = fi_td_4gram.get_number_matches(pattern)
plt.rcParams.update({'figure.figsize': (20, 10)})
x, y, x_ticks = get_bar_plot_info(functions)
x2, y2, x_ticks2 = get_bar_plot_info(all_functions)
# Two stacked bar charts; each gets a horizontal line at the pattern's match count.
fig, axes = plt.subplots(nrows=2)
upper_ax, lower_ax = axes
upper_ax.bar(x, y, align='center', tick_label=x_ticks)
upper_ax.axhline(y=maxim)
lower_ax.bar(x2, y2, align='center', tick_label=x_ticks2)
lower_ax.axhline(y=maxim)
Out[33]:
In [66]:
import astor
# For every non-empty item of the first get_patterns(1) pattern, print the
# source that astor renders for the example returned by
# templates.get_random_example(), then print the result of
# print_itemset_examples() for the whole pattern.
# NOTE(review): the final print is placed after the loop because it receives
# the whole pattern rather than a single item — confirm against the notebook.
l = fi_td_4gram.get_patterns(1)[0]
for el in l:
    if el != '':
        print(astor.to_source(ke_4gram.templates.get_random_example(el)))
print(fi_td_4gram.print_itemset_examples(l, 10))
In [ ]:
In [ ]: